{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e026861",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os, time, datetime, ast\n",
    "import re\n",
    "\n",
    "import requests\n",
    "import time\n",
    "import json\n",
    "\n",
    "os.chdir(\"/Users/xiaosongw/Dropbox/Research/InformedSources/Replication/Build\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e94490a5-eef5-40ea-a61e-31c73cb9a595",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tz2st = pd.read_csv(\"./Output/tz2st_out.csv\").rename(\n",
    "    columns={'tz':'tz0', 'duration':'dur0', 'distance':'dist0'})\n",
    "df_tz2st.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2991954-005d-4a75-828d-9597a0e01f0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_st2tz = pd.read_csv(\"./Output/st2tz_out.csv\").rename(\n",
    "    columns={'tz':'tz1', 'duration':'dur1', 'distance':'dist1'})\n",
    "df_st2tz.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a6b3884-4b4f-4d7f-b393-e04f5edee2f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tz2tz = pd.read_csv(\"./Output/tz2tz_out.csv\").rename(\n",
    "    columns={'tzn0':'tz0', 'tzn1':'tz1', \n",
    "             'duration':'dur', 'distance':'dist'})\n",
    "df_tz2tz['dur'] = df_tz2tz['dur'] / 60\n",
    "df_tz2tz.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b08f1e30",
   "metadata": {},
   "source": [
    "# routes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7618dba8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import multiprocessing as mp\n",
    "num_cores = mp.cpu_count()\n",
    "print(num_cores)\n",
    "\n",
    "def myfunc(indf):\n",
    "    dfout_ = pd.DataFrame()\n",
    "    for ix, ir in indf.iterrows():\n",
    "        df_ = df_tz2st[(df_tz2st['tz0']==ir['tz0'])&(df_tz2st['dur0']<ir['dur']+10)].merge(\n",
    "            df_st2tz[(df_st2tz['tz1']==ir['tz1'])&(df_st2tz['dur1']<ir['dur']+10)], on='id', how='inner')\n",
    "        df_['dur_tot'] = df_['dur0'] + df_['dur1']\n",
    "        df_['dist_tot'] = df_['dist0'] + df_['dist1']\n",
    "        df_['dur'] = ir['dur']\n",
    "        df_['dist'] = ir['dist']\n",
    "        df_['t'] = df_['dur_tot'] - df_['dur']\n",
    "        dfout_ = pd.concat([dfout_, df_[df_['t']<10]], axis=0)\n",
    "    return dfout_\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0afb67e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "dfin_split = np.array_split(df_tz2tz, 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4c40528",
   "metadata": {},
   "outputs": [],
   "source": [
    "start = time.time()\n",
    "pool = mp.Pool(num_cores)\n",
    "# parallel is 8 times faster\n",
    "for i in range(20):\n",
    "    df_split = np.array_split(dfin_split[i], num_cores)\n",
    "    df_tz2st2tz = pd.concat(pool.map(myfunc, df_split))\n",
    "    df_tz2st2tz.sort_values(['tz0', 'tz1', 't'], ignore_index=True, inplace=True)\n",
    "    df_tz2st2tz.to_csv(\"./Temp/tz2st2tz_out_{}.csv\".format(i), index=False)\n",
    "    str_out = '{} done! Time {} min\\n'.format(i, round((time.time()-start)/60, 1))\n",
    "    print(str_out)\n",
    "    with open('log.txt', 'a') as f:\n",
    "        f.write(str_out)\n",
    "pool.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08fb4ea5",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "l_files = [i for i in os.listdir('./Temp/') if 'tz2st2tz_out_' in i]\n",
    "df_out = pd.DataFrame()\n",
    "for i in l_files:\n",
    "    df_ = pd.read_csv('./Temp/'+i)\n",
    "    df_.rename(columns={'t':'t0'}, inplace=True)\n",
    "    df_['rank'] = df_.groupby(['tz0', 'tz1'])['t0'].rank(method='dense')\n",
    "    df_['t_min'] = df_.groupby(['tz0', 'tz1'])['t0'].transform('min')\n",
    "    df_.loc[df_['t_min']>=0, 't_min'] = 0\n",
    "    df_['t'] = df_['t0'] + abs(df_['t_min'])\n",
    "    print('-----\\nfile {}: \\n'.format(i.split('_')[-1]), df_[['t0', 't']].describe().transpose().round(2))\n",
    "    df_ = df_[(df_['t']<5)&(df_['rank']<=15)]\n",
    "    df_out = pd.concat([df_out, df_[['tz0', 'id', 'tz1', 't', 'rank']]], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9a54c5c-640c-4319-9de3-09d0a34ddab6",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in l_files:\n",
    "    os.remove(\"./Temp/\"+i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc0cf3d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out.reset_index(drop=True, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb50e3b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out.to_csv(\"./Output/tz2st2tz_out.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b08bd60a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f161dd5-a606-430b-8191-3e3c286a5f90",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
